--- layout: post title: "User Location Validation" date: 2020-07-21 18:46:30 -0500 categories: jekyll update ---
What follows is an exploration of the validity of self-reported user location data in tweets. While only around 5% of the decahose have this data in a machine processable format (roughly matching the format city, state, e.g. Burlington, VT), we hope to show that this subset is a reliable proxy for a user's real world location.
We find that user self-reported location is generally reliable, with ~80% of users having a median tweet location within 50km of their self-reported cities. While this only holds for users who actually provide a location that can be fuzzy-string matched to a city, state pair, around 5% of tweets do have this feature.
import os
import sys
import datetime
from dateutil.relativedelta import relativedelta
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import subprocess
from multiprocessing import Pool
sys.path.append(f'{os.getenv("HOME")}/tweet_query/src/')
from itertools import combinations
from tweet_db_query import tweet_connect, ambient_ngrams_dataframe, rank_divergence, extract_timeseries, get_ambient_ngrams, save_ambient_timeseries, top_n_timeseries, assemble_ambient_ngrams
from pymongo import MongoClient
from measurements import compute_pca
import dill
sys.path.append(f'{os.getenv("HOME")}/.passwords')
import mongo_password
sys.path.append(f'{os.getenv("HOME")}/ngram_query')
import mongo_query
from cenpy import products
import cenpy
import geopandas as gpd
import plotly.figure_factory as ff
from pprint import pprint
from matplotlib.lines import Line2D
import warnings
import plotly
import plotly.express as px
import matplotlib
# Global matplotlib configuration for every figure in this notebook.
font = {'family' : 'normal',
        'weight' :'normal',
        'size' : 22}
matplotlib.rc('font', **font)  # 22pt text everywhere
# Raise the Agg path chunk size so very large line/scatter plots render
# without "OverflowError: Exceeded cell block limit"-style failures.
matplotlib.rcParams['agg.path.chunksize'] = 10000
sys.path.append(f'{os.getenv("HOME")}/tweet_utils/src')
import utils as tweet_utils
sys.path.append(f'{os.getenv("HOME")}/storywrangler/src/')
from regexr import get_emojis_parser, get_ngrams_parser, filter_text, remove_whitespaces, ngrams
import counter
def db_selection(day, high_res=False, ambient='mental_health'):
    """Return the (database, collection) pair holding location-tagged tweets.

    :param day: datetime whose year names the Mongo collection
    :param high_res: unused here; kept so existing call sites keep working
    :param ambient: unused here; kept so existing call sites keep working
    :return: tuple (database name, collection name)
    """
    # Plain strings: the originals were f-strings with no placeholders.
    db = 'tweets_segmented_location'
    collection = str(day.year)
    return db, collection
def tweets_per_day(day, high_res=False, tweets=None):
    """Count tweets created during the 24 hours starting at `day`.

    :param day: datetime marking the start of the day to count
    :param high_res: forwarded to db_selection
    :param tweets: optional pre-connected collection; one is opened if None
    :return: number of matching tweet documents
    """
    if tweets is None:
        database, collection = db_selection(day, high_res)
        tweets = tweet_connect('guest', 'roboctopus', database=database, collection=collection)
    window = {'tweet_created_at': {"$gte": day, "$lt": day + relativedelta(days=+1)}}
    return tweets.count_documents(window)
def tweets_per_state(state, year, tweets=None):
    """Count tweets whose matched self-reported location is in `state`.

    :param state: two-letter state abbreviation
    :param year: year used to select the Mongo collection
    :param tweets: optional pre-connected collection; one is opened if None
    :return: number of matching tweet documents
    """
    day = datetime.datetime(year, 1, 1)
    if tweets is None:
        database, collection = db_selection(day)
        tweets = tweet_connect('guest', 'roboctopus', database=database, collection=collection)
    return tweets.count_documents({'state': state})
def tweets_per_city(city, year, tweets=None):
day = datetime.datetime(year,1,1)
if tweets is None:
db,collection = db_selection(day)
tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
query = {'city_state': {'$regex': f"^{city}"}}
return tweets.count_documents(query)
# Two-letter state abbreviation -> census FIPS code (as strings).
# NOTE: the values here are overwritten in-place with per-state tweet
# counts by the "get tweet count data" loop further down, after which
# this maps abbreviation -> tweet count.
state_codes = {
    'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
    'FL': '12', 'WY': '56', 'NJ': '34', 'NM': '35', 'TX': '48',
    'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
    'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
    'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
    'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
    'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
    'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
    'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46'
}
# Initialized as a copy of the FIPS mapping purely as a placeholder:
# every value is replaced with the state's ACS population total by the
# "get population data" loop below.
state_pop = {
    'WA': '53', 'DE': '10', 'DC': '11', 'WI': '55', 'WV': '54', 'HI': '15',
    'FL': '12', 'WY': '56', 'NJ': '34', 'NM': '35', 'TX': '48',
    'LA': '22', 'NC': '37', 'ND': '38', 'NE': '31', 'TN': '47', 'NY': '36',
    'PA': '42', 'AK': '02', 'NV': '32', 'NH': '33', 'VA': '51', 'CO': '08',
    'CA': '06', 'AL': '01', 'AR': '05', 'VT': '50', 'IL': '17', 'GA': '13',
    'IN': '18', 'IA': '19', 'MA': '25', 'AZ': '04', 'ID': '16', 'CT': '09',
    'ME': '23', 'MD': '24', 'OK': '40', 'OH': '39', 'UT': '49', 'MO': '29',
    'MN': '27', 'MI': '26', 'RI': '44', 'KS': '20', 'MT': '30', 'MS': '28',
    'SC': '45', 'KY': '21', 'OR': '41', 'SD': '46'
}
# Two-letter abbreviation -> full state name (despite the name, which
# suggests the reverse mapping). Used to feed cenpy's from_state().
us_state_abbrev = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
    'DC': "Washington, DC"
}
# get population data
# Query the 2017 ACS for total population (variable B01001_001E) at the
# county level and sum counties to get a state-level total.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")  # cenpy emits noisy warnings
    for key,value in state_codes.items():
        state = products.ACS(2017).from_state(us_state_abbrev[key], level='county',
                                              variables='B01001_001E')
        state_pop[key] = state['B01001_001E'].sum()
# get tweet count data
# Overwrites the FIPS placeholder values in state_codes with tweet counts
# (replacing values of existing keys while iterating items() is safe).
for key,value in state_codes.items():
    tweet_count = tweets_per_state(key,2009)
    state_codes[key] = tweet_count
# save data
# NOTE(review): written without a .csv extension and with the default
# integer index column; read back below as 'tweet_data'.
df = pd.DataFrame([(key, state_pop[key],state_codes[key]) for key,value in state_codes.items()],columns=['State','Population','Tweets'])
df.to_csv('tweet_data')
# Tweets vs. population, one point per state, on log-log axes; rendered
# inline and also written to an offline HTML file.
from plotly.offline import init_notebook_mode
init_notebook_mode(connected = False)
df1 = pd.read_csv('tweet_data')
fig = px.scatter(df1, x="Population", y="Tweets",
                 hover_data=['State'],title='User Location by State: 2009')
fig.update_traces( marker=dict(size=9,
                               line=dict(width=2,
                                         color='DarkSlateGrey')))
fig.update_layout(xaxis_type="log", yaxis_type="log")
plotly.offline.plot(fig, filename='tweets_per_state')
fig.show()
We can see that states with larger populations also tend to have a higher number of tweets per capita.
# Tweets per capita by state, then a choropleth on a state shapefile.
df1['tweets_per_capita'] = df1['Tweets']/df1['Population']
fig = px.scatter(df1, x="Population", y='tweets_per_capita',
                 hover_data=['State'],title='User Location by State: 2009')
fig.update_traces( marker=dict(size=9,
                               line=dict(width=2,
                                         color='DarkSlateGrey')))
fig.update_layout(xaxis_type="log", yaxis_type="log")
# NOTE(review): output filename is misspelled ('captia'); left unchanged
# since fixing it would change which HTML file gets written.
plotly.offline.plot(fig, filename='tweets_per_captia')
fig.show()
# merge data with shapefile
# NOTE(review): hard-coded local path; the shapefile must expose a
# STATE_ABBR attribute for the merge below.
gdf = gpd.read_file('/home/michael/Downloads/state_shapefile/states.dbf')
gdf = gdf.merge(df1, left_on='STATE_ABBR',right_on='State')
# plot
# DC is excluded from the map — presumably to keep the color scale
# readable; confirm.
figsize=(20,9)
gdf[gdf['State']!='DC'].plot(column='tweets_per_capita',figsize=figsize, legend=True, cmap='Blues')
plt.title('Tweets Per Capita: 2009')
plt.axis('off')
Now that we have a tool to query tweets by state, let's map the average happiness by state.
import pathlib
import counter
from tweet_db_query import *
from sentiment import *
def get_ngrams_by_state(state, scheme=1, lang='en', collection='2009', database='tweets_segmented_location'):
    """Accumulate n-gram counts over all tweets matched to one state.

    :param state: two-letter state abbreviation to query
    :param scheme: length of n-gram to parse
    :param lang: unused here; kept so existing call sites keep working
    :param collection: mongo collection to query
    :param database: mongo database to query
    :return: a counter.NgramCounter of n-gram counts
    """
    # Plain string: the original was an f-string with no placeholders.
    ngrams_pth = '../../ngrams.bin'
    ngrams_parser = get_ngrams_parser(ngrams_pth)
    tweets = tweet_connect('guest', 'roboctopus', collection=collection, database=database)
    counts = counter.NgramCounter({})
    for tweet in tweets.find({'state': state}):
        counts += parse_ngrams_tweet(tweet, ngrams_parser, scheme)
    return counts
# load labMT happiness dictionaries
# word -> happiness score lookup built from the labMT dataset.
happ_dict = {}
happs = pd.read_csv('/home/michael/labMT2english.txt',sep='\t')
happs = happs.set_index('word')
word2score_ref = happs['happs'].to_dict()
# query tweets by state and score sentiment
# NOTE(review): happ_dict is filled here but never explicitly merged into
# gdf below; the plotted 'happs' column comes from the saved shapefile,
# so the merge presumably happened in an earlier run — verify.
for state in gdf['State']:
    counter_i = get_ngrams_by_state(state)
    sentiment_value_i = counter_sentiment(counter_i, word2score_ref)
    happ_dict[state] = sentiment_value_i
# Only run if necessary. querying takes a long time.
rerun = False
if rerun:
    gdf.to_file('2009_states_happiness')
# reload file: Run
# Read back the cached shapefile (includes the 'happs' column).
gdf = gpd.read_file('2009_states_happiness')
figsize=(30,7)
gdf[gdf['State']!='DC'].plot(column='happs',figsize=figsize, legend=True, cmap='Blues')
plt.title('Tweet Sentiment by State: 2009')
plt.axis('off')
One major upgrade made possible by the Mongo database is querying for tweets containing anchor words. Using aggregations we can now quickly get timeseries of the number of tweets containing a word in a city or state
# Estimate New York state's population by summing the populations of all
# of its zipcodes (uszipcode full database).
from uszipcode import SearchEngine
search = SearchEngine(simple_zipcode=False)
zipcodes = search.by_state(state='New York',zipcode_type=None, returns=10000)
# count population of New York
x = 0
for i in zipcodes:
    if i.population:  # skip zipcodes with missing population data
        x += i.population
x  # notebook display of the total
def get_city_mentions_by_city(word, scheme=1, lang='en', collection='2009', database='tweets_segmented_location'):
    """Yield per-city counts of tweets whose text matches `word`.

    Runs a Mongo text-index search for the quoted (exact-phrase) word and
    groups the matches by the tweet's `city_state` field.

    :param word: ambient anchor word/phrase to search for
    :param scheme: unused here; kept so existing call sites keep working
    :param lang: unused here; kept so existing call sites keep working
    :param collection: mongo collection to query
    :param database: mongo database to query
    :return: generator of {'_id': {'city_state': ...}, 'count': n} docs
    """
    # (The original loaded an n-gram parser here on every call; the result
    # was never used, so that dead file I/O has been removed.)
    tweets = tweet_connect('guest', 'roboctopus', collection=collection, database=database)
    pipeline = [
        # Quoting the word makes $search an exact-phrase match.
        {'$match': {'$text': {'$search': f'"{word}"'}}},
        {'$group': {
            '_id': {'city_state': '$city_state'},
            'count': {'$sum': 1},
        }},
    ]
    yield from tweets.aggregate(pipeline)
def get_weekly_mentions_by_city(word, scheme=1, lang='en', collection='2009', database='tweets_segmented_location'):
    """Yield weekly per-city counts of tweets whose text matches `word`.

    Runs a Mongo text-index search for the quoted (exact-phrase) word and
    groups the matches by (city_state, $week of tweet_created_at).

    :param word: ambient anchor word/phrase to search for
    :param scheme: unused here; kept so existing call sites keep working
    :param lang: unused here; kept so existing call sites keep working
    :param collection: mongo collection to query
    :param database: mongo database to query
    :return: generator of {'_id': {'city_state', 'week'}, 'count': n} docs
    """
    # (Dead n-gram parser load removed; its result was never used.)
    tweets = tweet_connect('guest', 'roboctopus', collection=collection, database=database)
    pipeline = [
        {'$match': {'$text': {'$search': f'"{word}"'}}},
        {'$group': {
            '_id': {
                'city_state': '$city_state',
                'week': {'$week': '$tweet_created_at'},
            },
            'count': {'$sum': 1},
        }},
    ]
    yield from tweets.aggregate(pipeline)
def get_language_by_city(city='New York, NY', collection='2009', database='tweets_segmented_location'):
    """Yield per-language tweet counts for a single city.

    Groups the city's tweets by their fastText-detected language.

    :param city: a city name in "City, ST" format (matches `city_state`)
    :param collection: mongo collection to query
    :param database: mongo database to query
    :return: generator of {'_id': {'fastText_lang': ...}, 'count': n} docs
    """
    # (Dead n-gram parser load and a leftover debug print of every
    # aggregation document have been removed.)
    tweets = tweet_connect('guest', 'roboctopus', collection=collection, database=database)
    pipeline = [
        {'$match': {'city_state': city}},
        {'$group': {
            '_id': {'fastText_lang': '$fastText_lang'},
            'count': {'$sum': 1},
        }},
    ]
    yield from tweets.aggregate(pipeline)
def plot_mentions(x, city='New York, NY', word='coronavirus', ax=None):
    """Scatter-plot weekly mention counts of an anchor word for one city.

    :param x: iterable of {'_id': {'city_state', 'week'}, 'count'} docs
              (the output of get_weekly_mentions_by_city)
    :param city: city to filter the documents on
    :param word: unused here (the y-label using it is commented out);
                 kept so existing call sites keep working
    :param ax: matplotlib axes to draw on; a new figure is made if None
    """
    rows = [doc for doc in sorted(x, key=lambda d: d['count'], reverse=True)
            if doc['_id']['city_state'] == city]
    weeks = [doc['_id']['week'] for doc in rows]
    counts = [doc['count'] for doc in rows]
    # (An unused weeks2 pd.date_range for early 2020 was removed.)
    if ax is None:
        f, ax = plt.subplots(figsize=(8, 6))
    ax.plot(weeks, counts, 'o')
    ax.set_title(city)
    ax.set_xlabel('Week of the Year')
    #ax.set_ylabel(f'Tweets with "{word}"')
def city_name(name):
    """Return the city portion of a "City, ST" string.

    :param name: location string (or None / non-string)
    :return: text before the first comma, or None when `name` has no
             .split (e.g. it is None)
    """
    try:
        return name.split(',')[0]
    except AttributeError:  # was a bare except; only non-strings can fail here
        return None
Let's see how attention to the coronavirus evolved in time in some major US cities.
# Pull weekly per-city "coronavirus" mention counts for 2020 and peek at
# the first few aggregation documents.
word = 'coronavirus'
x = [i for i in get_weekly_mentions_by_city(word, collection='2020')]
x[:10]  # notebook display of a sample
# Cities for the weekly-mentions grid below.
# NOTE(review): mostly major metros, but two small towns are mixed in
# ('New Middletown, IN' and 'Hanna, UT') — confirm they are intentional.
cities = ['Los Angeles, CA',
          'New York, NY',
          'Houston, TX',
          'Washington, DC',
          'Chicago, IL',
          'Atlanta, GA',
          'Dallas, TX',
          'Miami, FL',
          'New Middletown, IN',
          'Austin, TX',
          'Boston, MA',
          'Seattle, WA',
          'Brooklyn, NY',
          'San Francisco, CA',
          'San Diego, CA',
          'Las Vegas, NV',
          'Philadelphia, PA',
          'San Antonio, TX',
          'Portland, OR',
          'Phoenix, AZ',
          'Denver, CO',
          'Orlando, FL',
          'Hanna, UT',
          'Baltimore, MD',
          'Pittsburgh, PA',]
# 4x5 grid of per-city weekly mention plots (only the first 20 of the 25
# cities fit the grid).
f,ax = plt.subplots(4,5,figsize=(19,15),sharey=True,sharex=True)
ax = ax.ravel()
for i,city in enumerate(cities[:20]):
    plot_mentions(x,city=city, ax=ax[i])
plt.tight_layout()
plt.suptitle(f'"{word}" Mentions ', y=1.04, fontsize=36)
Some obvious next steps here would be to correct for population density. With the full year it would be interesting to see how attention scales with recent case numbers, and whether different cities reach maximum attention at different times.
import networkx as nx
# Large US cities used both as graph nodes and as text-search terms for
# the city-mention network below.
# NOTE(review): several names appear twice ('Nashville', 'Louisville',
# 'Lexington', 'Boise', 'Augusta') — the duplicates trigger repeated
# (expensive) queries; confirm whether they are intentional.
cities = ['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix',
          'Philadelphia', 'San Antonio', 'San Diego', 'Dallas', 'San Jose',
          'Austin', 'Jacksonville', 'Fort Worth', 'San Francisco',
          'Columbus', 'Charlotte', 'Indianapolis', 'Seattle', 'Denver',
          'Washington Dc', 'El Paso', 'Boston', 'Nashville', 'Nashville',
          'Portland', 'Las Vegas', 'Detroit', 'Oklahoma City', 'Memphis',
          'Louisville', 'Louisville', 'Baltimore', 'Milwaukee',
          'Albuquerque', 'Tucson', 'Fresno', 'Sacramento', 'Mesa', 'Atlanta',
          'Kansas City', 'Colorado Springs', 'Miami', 'Raleigh',
          'Long Beach', 'Virginia Beach', 'Omaha', 'Oakland', 'Minneapolis',
          'Arlington', 'Tampa', 'Tulsa', 'New Orleans', 'Wichita',
          'Bakersfield', 'Cleveland', 'Aurora', 'Anaheim', 'Honolulu',
          'Riverside', 'Santa Ana', 'Lexington', 'Lexington',
          'Corpus Christi', 'Henderson', 'Stockton', 'St. Paul',
          'Pittsburgh', 'St. Louis', 'Cincinnati', 'Anchorage', 'Orlando',
          'Irvine', 'Plano', 'Greensboro', 'Lincoln', 'Newark', 'Durham',
          'Toledo', 'St. Petersburg', 'Chula Vista', 'Fort Wayne',
          'Scottsdale', 'Jersey City', 'Laredo', 'Madison', 'Lubbock',
          'Reno', 'Chandler', 'Glendale', 'Buffalo', 'North Las Vegas',
          'Gilbert', 'Winston Salem', 'Chesapeake', 'Irving', 'Norfolk',
          'Fremont', 'Hialeah', 'Richmond', 'Boise', 'Boise', 'Garland',
          'Baton Rouge', 'Spokane', 'Tacoma', 'Modesto', 'San Bernardino',
          'Fontana', 'Des Moines', 'Oxnard', 'Moreno Valley', 'Birmingham',
          'Fayetteville', 'Rochester', 'Amarillo', 'Port St. Lucie',
          'Yonkers', 'Mckinney', 'Grand Prairie', 'Salt Lake City',
          'Grand Rapids', 'Little Rock', 'Huntsville', 'Huntington Beach',
          'Augusta', 'Augusta', 'Overland Park', 'Montgomery', 'Tempe',
          'Akron', 'Cape Coral', 'Tallahassee', 'Frisco', 'Mobile',
          'Knoxville', 'Shreveport', 'Brownsville', 'Worcester',
          'Santa Clarita', 'Sioux Falls', 'Fort Lauderdale', 'Vancouver',
          'Rancho Cucamonga', 'Chattanooga', 'Newport News', 'Ontario',
          'Providence', 'Elk Grove', 'Salem', 'Oceanside', 'Santa Rosa',
          'Corona', 'Eugene', 'Garden Grove', 'Peoria', 'Pembroke Pines',
          'Fort Collins', 'Cary', 'Springfield', 'Jackson', 'Alexandria',
          'Hayward', 'Hollywood', 'Lakewood', 'Lancaster', 'Salinas',
          'Sunnyvale', 'Palmdale', 'Clarksville', 'Escondido', 'Pomona',
          'Pasadena', 'Killeen', 'Macon Bibb County', 'Joliet',
          'Murfreesboro', 'Mcallen', 'Savannah', 'Naperville', 'Paterson',
          'Thornton', 'Bellevue', 'Torrance', 'Rockford', 'Miramar',
          'Bridgeport', 'Mesquite', 'Fullerton', 'Denton', 'Waco',
          'Syracuse', 'Roseville', 'Orange', 'Surprise', 'Dayton',
          'Charleston', 'Olathe', 'Midland', 'West Valley City',
          'Gainesville']
# Build a directed "mention" graph: for each target city, add an edge
# from the tweet's origin city to the mentioned city, weighted by the
# number of mentioning tweets.
G = nx.DiGraph()
for city in cities:
    city_dict = [G.add_edge(city_name(i['_id']['city_state']),city, weight=i['count']) for i in get_city_mentions_by_city(city)]
# Sample query (notebook display): which cities mention "Rome"?
[(city_name(i['_id']['city_state']), i['count']) for i in get_city_mentions_by_city('Rome')]
len(G.edges())  # notebook display of graph size
# Subgraph of every 5th city among the first 80; edge widths scale with
# sqrt(weight) to keep heavy edges from dominating.
G1 = G.subgraph(cities[:80:5])
weights = [np.sqrt(G[u][v]['weight'])/2 for u,v in G1.edges()]
f,ax = plt.subplots(figsize=(20,20))
nx.draw(G1,pos=nx.kamada_kawai_layout(G1,weight=None), ax=ax,with_labels=True,node_size=2500,width=weights)
# Degree distribution of the mention graph on log-log axes.
# `collections` was never imported at the top of the file, so the
# original collections.Counter call raised NameError; import it here.
from collections import Counter

degree_sequence = sorted([d for n, d in G.degree()], reverse=True)  # degree sequence
degreeCount = Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
fig, ax = plt.subplots(figsize=(10,10))
plt.plot(deg, cnt,'o', color='b')
plt.yscale('log')
plt.xscale('log')
plt.title("Degree Histogram")
plt.ylabel("Count")
plt.xlabel("Degree")
# City-level population vs. tweet counts.
cities = pd.read_csv('~/city_fame/data/cities',header=None)
population = pd.read_csv('~/city_fame/data/population',header=None, names=['Population'])
population.index = cities[0].values  # index rows by city name
#data = city_fame(population,np.median)
population['Tweets'] = 0
population.index  # notebook display of the city names
for city in population.index:
    tweet_count = tweets_per_city(city,2009)
    # .loc instead of population['Tweets'][city]: chained assignment can
    # silently write to a copy (SettingWithCopyWarning).
    population.loc[city, 'Tweets'] = tweet_count
# Tweets vs. population, one point per city, on log-log axes.
fig = px.scatter(population, x="Population", y="Tweets",
                 hover_data=[population.index],title='User Location by City: 2009')
fig.update_traces( marker=dict(size=9,
                               line=dict(width=2,
                                         color='DarkSlateGrey')))
fig.update_layout(xaxis_type="log", yaxis_type="log")
# NOTE(review): filename says 'tweets_per_state' but this is the per-city
# figure — it overwrites the state plot's HTML. Left unchanged here.
plotly.offline.plot(fig, filename='tweets_per_state')
fig.show()
def _match_location(location_text, location_searcher):
    """Fuzzy-match a free-text "City, ST" string to a uszipcode record.

    :param location_text: raw self-reported location string
    :param location_searcher: uszipcode SearchEngine
    :return: the best-matching record, or None when the text is not a
             two-part "city, state" string or the lookup fails.
    """
    parts = location_text.split(',')
    if len(parts) != 2:
        return None
    try:
        return location_searcher.by_city_and_state(parts[0].strip(),
                                                   parts[1].strip())[0]
    except (ValueError, IndexError, KeyError):
        return None


def get_user_loc(args):
    """Resolve a tweet's self-reported bio location to coordinates.

    Reads the location from `user.location` (standard API format) or,
    when no 'user' key is present, from `actor.location.displayName`
    (Gnip activity-stream format).

    Side effect: on a successful match the tweet dict gains 'city_state'
    ("City, ST") and 'state' keys.

    :param args: (tweet JSON dict, uszipcode SearchEngine)
    :return: (lng, lat) of the matched location, or None when the tweet
             has no location or it cannot be matched.
    """
    tweet, location_searcher = args
    user_location = None
    if 'user' in tweet:
        if tweet['user'].get('location') is not None:
            user_location = _match_location(tweet['user']['location'], location_searcher)
    # NOTE(review): the original indexed tweet['actor'] directly, which
    # raised KeyError for tweets lacking both keys; .get guards that.
    elif tweet.get('actor', {}).get('location') is not None:
        user_location = _match_location(tweet['actor']['location']['displayName'],
                                        location_searcher)
    if user_location is None:
        return None
    tweet['city_state'] = f"{user_location.major_city}, {user_location.state}"
    tweet['state'] = user_location.state
    return user_location.lng, user_location.lat
def get_loc(args):
    """Resolve a tweet's self-reported bio location to coordinates.

    This function was a byte-for-byte duplicate of get_user_loc; it now
    delegates so the matching logic lives in exactly one place.

    :param args: (tweet JSON dict, uszipcode SearchEngine)
    :return: (lng, lat) of the matched location, or None
    """
    return get_user_loc(args)
# Compare each geotagged tweet's GPS point against the user's matched
# bio location for 2011.
from uszipcode import SearchEngine
from shapely.geometry import Point, LineString
location_search = SearchEngine()
db = 'tweets'
collection='geotweets'
tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
user_loc_list = []
loc_list = []
t_list = []
begin = datetime.datetime(2011,1,1)
end = datetime.datetime(2012,1,1)
query= {'tweet_created_at': {'$gte':begin,'$lt':end}}
# NOTE(review): limit=2 looks like a leftover debug limit — confirm.
for t in tweets.find(query,limit=2):
    user_loc = get_user_loc((t,location_search))
    if user_loc and user_loc[0]:
        # NOTE(review): get_user_loc is re-run twice below for the same
        # tweet; `user_loc` could be reused instead.
        print(get_user_loc((t,location_search)))
        user_loc_list.append(Point(get_user_loc((t,location_search))))
        loc_list.append(Point(t['geo']['coordinates']))
        t_list.append(t)
# Project both point sets to a projected (meter-based) CRS so .distance()
# returns meters, then histogram the tweet-to-home separation in km.
user_loc_gdf = gpd.GeoDataFrame(geometry=user_loc_list,crs={'init':'epsg:4326'})
loc_gdf = gpd.GeoDataFrame(geometry=loc_list,crs={'init':'epsg:4326'})
user_loc_gdf.to_crs(epsg=3310,inplace=True)
loc_gdf.to_crs(epsg=3310,inplace=True)
dist = user_loc_gdf.distance(loc_gdf)/1000  # meters -> km
bins = np.logspace(0,5,20)
ax = dist.hist(bins=bins)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
f = plt.figure(facecolor='w')
# NOTE(review): the three histograms below all plot the same `dist`
# series computed above; the data is not re-queried for each year, so
# only the titles differ — confirm whether separate runs were intended.
ax = dist.hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
ax.set_title('2011')
ax = dist.hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
ax.set_title('2010')
ax = dist.hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
ax.set_title('2009')
# Collect distinct user ids from up to n geotagged tweets in 2010-2013.
user_ids = []
n = 100000
begin = datetime.datetime(2010,1,1)
end = datetime.datetime(2014,1,1)
query= {'tweet_created_at': {'$gte':begin,'$lt':end}}
# Pass the date-window query to find(): it was built immediately above
# but never applied, so the scan ignored the 2010-2014 window.
for t in tweets.find(query, limit=n):
    user_ids.append(t['user']['id'])
user_ids = set(user_ids)  # de-duplicate
N = len(user_ids)
# multiprocess user info
def get_user_loc_info(user_id):
    """Gather the matched profile location and GPS points for one user.

    Opens its own Mongo connection so it is safe to run inside
    multiprocessing workers (cf. get_user_loc_info_local, which reuses
    the module-level connection).

    :param user_id: numeric Twitter user id
    :return: dict with 'profile_location' (lng, lat) and parallel lists
             'lat', 'lng', 'tweet_created_at' for up to 1000 geotagged
             tweets; None when the bio location cannot be matched
    """
    user_dict = {}
    db = 'tweets'
    collection='geotweets'
    tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
    # get example tweet # assumes user account location doesn't change
    query = {'user.id':user_id}
    t = tweets.find_one(query)
    # find lat long of User defined account location
    location_search = SearchEngine()
    user_loc = get_user_loc((t,location_search))
    if user_loc and user_loc[0]:
        user_dict['profile_location'] = user_loc
        user_dict['lat'] = []
        user_dict['lng'] = []
        # Collect one timestamp per tweet. The original assigned a single
        # scalar here each iteration (keeping only the last timestamp),
        # inconsistent with get_user_loc_info_local which appends.
        user_dict['tweet_created_at'] = []
        for t in tweets.find(query,limit=1000):
            user_dict['tweet_created_at'].append(t['tweet_created_at'])
            user_dict['lat'].append(t['geo']['coordinates'][0])
            user_dict['lng'].append(t['geo']['coordinates'][1])
        return user_dict
    else:
        return None
def get_user_loc_info_local(user_id):
    """Collect a user's matched profile location and geotagged tweets.

    Single-process variant of get_user_loc_info: reuses the module-level
    `tweets` collection instead of opening a fresh connection.

    :param user_id: numeric Twitter user id
    :return: dict with 'profile_location' (lng, lat) and parallel lists
             'lat', 'lng', 'tweet_created_at' (up to 1000 tweets), or
             None when the bio location cannot be matched
    """
    query = {'user.id': user_id}
    # One example tweet; assumes the account's bio location is stable.
    example = tweets.find_one(query)
    location_search = SearchEngine()
    user_loc = get_user_loc((example, location_search))
    if not (user_loc and user_loc[0]):
        return None
    info = {
        'profile_location': user_loc,
        'lat': [],
        'lng': [],
        'tweet_created_at': [],
    }
    for t in tweets.find(query, limit=1000):
        info['tweet_created_at'].append(t['tweet_created_at'])
        info['lat'].append(t['geo']['coordinates'][0])
        info['lng'].append(t['geo']['coordinates'][1])
    return info
def user_loc_change(user_id):
    """Count how often a user's raw self-reported bio location changes.

    Walks up to 1000 of the user's tweets in chronological order and
    counts transitions of the `user.location` string; the first non-empty
    location observed counts as one change from the initial ''.

    :param user_id: numeric Twitter user id
    :return: number of observed location changes
    """
    query = {'user.id':user_id}
    user_loc = ''
    print(user_id,user_loc)  # diagnostic trace kept from the notebook run
    # Renamed from `counter`, which shadowed the imported `counter` module.
    change_count = 0
    for t in tweets.find(query,limit=1000).sort('tweet_created_at',1):
        new_loc = t['user']['location']
        if user_loc != new_loc:
            print(user_id,new_loc, user_loc,t['tweet_created_at'])
            change_count += 1
            user_loc = new_loc
    return change_count
# does user location change?
db = 'tweets'
collection='geotweets'
tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
# get example tweet # assumes user account location doesn't change
location_search = SearchEngine()
# Count bio-location changes for every sampled user (serial map).
user_info_list = list(map(user_loc_change, user_ids))
f,ax = plt.subplots(figsize=(12,6),facecolor='w')
plt.hist(user_info_list,bins=30)
#ax.set_yscale('log')
plt.xlabel(' Number of Self-reported User Location Updates')
plt.ylabel('Number of Users')
# Re-connect and gather per-user GPS points + matched profile location.
db = 'tweets'
collection='geotweets'
tweets = tweet_connect('guest', 'roboctopus', database=db, collection=collection)
# get example tweet # assumes user account location doesn't change
location_search = SearchEngine()
user_info_list = list(map(get_user_loc_info_local, user_ids))
# sample size, distinct users, and users with a matchable bio location
print(n,N,len([i for i in user_info_list if i is not None]))
# filter out users with no matched account location
# Flatten per-user records into one row per geotagged tweet.
user_loc_dict = []
for key,value in zip(user_ids,user_info_list):
    if value is not None:
        for lat,lng,time_stamp in zip(value['lat'],value['lng'],value['tweet_created_at']):
            user_loc_dict.append( {'profile_location':Point(value['profile_location']),  # (lng, lat) from get_user_loc
                                   'user_id':key,
                                   'lat':lat,
                                   'lng':lng,
                                   'tweet_created_at':time_stamp})
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
df = pd.DataFrame(user_loc_dict)
# NOTE(review): points_from_xy expects (x=longitude, y=latitude); passing
# df.lat as x looks swapped relative to profile_location's (lng, lat)
# ordering — confirm the axis convention of geo.coordinates in this db.
gdf = gpd.GeoDataFrame(df,geometry=gpd.points_from_xy(df.lat, df.lng),crs={'init':'epsg:4326'})
gdf['profile_location'] = gpd.GeoSeries(gdf['profile_location'])
ax = world.plot(color='white', edgecolor='black',figsize=(20,8))
# We can now plot our ``GeoDataFrame``.
gdf.plot(ax=ax, color='red',alpha=0.1,markersize=0.5)
plt.show()
from math import radians, cos, sin, asin, sqrt
def haversine(lon1, lat1, lon2, lat2):
    """
    Great-circle distance in kilometers between two points given in
    decimal degrees (haversine formula).
    """
    # work in radians throughout
    rlon1, rlat1 = radians(lon1), radians(lat1)
    rlon2, rlat2 = radians(lon2), radians(lat2)
    half_dlat = (rlat2 - rlat1) / 2
    half_dlon = (rlon2 - rlon1) / 2
    # haversine of the central angle
    chord = sin(half_dlat)**2 + cos(rlat1) * cos(rlat2) * sin(half_dlon)**2
    central_angle = 2 * asin(sqrt(chord))
    earth_radius = 6371  # km; use 3956 for miles
    return central_angle * earth_radius
def distance_wrapper(point1,point2):
    """Haversine distance in km between two shapely Points, treating .x
    as longitude and .y as latitude (matching haversine's signature)."""
    return haversine(point1.x, point1.y, point2.x, point2.y)
# Per-tweet distance (km) between the GPS point and the bio location.
gdf['distance'] = gdf.apply(lambda x: distance_wrapper(x['geometry'],x['profile_location']),axis=1)
# Line segment joining each tweet to the user's bio location (for maps).
gdf['line'] = gdf.apply(lambda x: LineString([[x['geometry'].x,x['geometry'].y],
                                              [x['profile_location'].x,x['profile_location'].y]]),
                        axis=1)
gdf['line'] = gpd.GeoSeries(gdf['line'])
# Per-user summary: distance from the median of the user's tweet
# coordinates to their bio location, plus a mean fractional tweet date.
distance_list = []
date_list = []
for user_id in gdf['user_id'].unique():
    # centroid of a single point is the point itself; median per axis
    gdf_uix = gdf[gdf['user_id']==user_id].centroid.x.median()
    gdf_uiy = gdf[gdf['user_id']==user_id].centroid.y.median()
    point = Point(gdf_uix,gdf_uiy)
    distance = distance_wrapper(gdf[gdf['user_id']==user_id]['profile_location'].values[0],
                                point)
    distance_list.append(distance)
    date = gdf[gdf['user_id']==user_id]['tweet_created_at'].mean()
    # NOTE(review): month/12 maps January to +1/12 while the final plot
    # uses (month-1)/12 — the two date fractions disagree by one month.
    date_list.append(date.year+(date.month/12))
# NOTE(review): leftover debug expression — it indexes a filtered frame
# with a mask built from the full frame, and lat < -74 looks like a
# longitude threshold; confirm and remove.
gdf[gdf['user_id']==15073985][gdf['lat']<-74]
# 2-D histogram: per-user median distance to bio location vs. time.
bins = np.logspace(0,4.5,20)
binsy = np.linspace(2009,2020,12*4)
f,ax = plt.subplots(figsize=(12,7),facecolor='w')
cbar = ax.hist2d(distance_list,date_list,bins=[bins,binsy],density=False)
ax.set_xscale('log')
ax.set_xlabel('Distance between Median phone (GPS) & \n Self-reported location (bio) \n [km]')
plt.colorbar(cbar[3])
# 1-D version of the same distribution.
bins = np.logspace(0,4.5,20)
f,ax = plt.subplots(figsize=(12,7),facecolor='w')
ax.hist(distance_list,bins=bins,density=False)
ax.set_xscale('log')
ax.set_xlabel('Distance between Median phone (GPS) & \n Self-reported location (bio) \n [km]')
# Fraction of users whose median tweet falls within each radius.
distance_array = np.array(distance_list)
# Use the full sample size: np.count_nonzero(distance_array) previously
# dropped users whose median location exactly matched their bio location
# (distance 0) from both numerator and denominator.
account_total = distance_array.size
dist_list = [1,5,10,13,50,100,200,500]
for dist in dist_list:
    # count of True values in the boolean mask, zero distances included
    accounts_gt = np.count_nonzero(distance_array < dist)
    print(f"{(accounts_gt*100/account_total):.1f}% user's median tweet location within {dist}km of self-reported account location")
While a median location is given above, here we show the tweet level distribution. Unfortunately the user generated location is not re-queried at each step leading to a potentially large source of error if users move and update their bio location frequently.
# Tweet-level distance distribution (every tweet, not per-user medians).
bins = np.logspace(0,4.5,20)
ax =gdf['distance'].hist(bins=bins,density=False,figsize=(12,8),grid=False)
ax.set_xscale('log')
ax.set_xlabel('Distance from Tweet to User\'s given Home \n[km]')
# Same distribution resolved by time: fractional year per tweet.
bins = np.logspace(0,4.5,50)
binsy = np.linspace(2009,2020,12*6)
f,ax = plt.subplots(figsize=(12,7),facecolor='w')
dates = np.array([i.year for i in gdf['tweet_created_at']])+np.array([(i.month-1)/12 for i in gdf['tweet_created_at']])
cbar = ax.hist2d(gdf['distance'],dates,bins=[bins,binsy],density=False)
ax.set_xscale('log')
ax.set_xlabel('Distance between phone location (GPS) & \n Self-reported location (bio) \n [km]')
plt.colorbar(cbar[3])